2  Expression data analysis

Author

Izar de Villasante

::: {.panel-tabset}

3 Loading data:

3.1 Sample sheet

table(ss[,Condition,by=Type])
      Condition
Type   cell_line metastasis primary tumour
  ARMS         2          2       3      2
  NBL          0          2       3      3
dtable(ss)

3.2 Annotation:

3.2.1 Affymetrix default:

The added file will be used as starting point to annotate CalriomD array.

library(data.table)
anno <- data.table::fread(params$array_annotation)
names(anno) <- make.names(colnames(anno))
# anno[,rn:=Probe.Set.ID]
str(anno)
Classes 'data.table' and 'data.frame':  134748 obs. of  3 variables:
 $ Probe.Set.ID: chr  "TC0100007786.hg.1" "TC1600011519.hg.1" "TC1600011520.hg.1" "TC0800012474.hg.1" ...
 $ Gene.Symbol : chr  "AGO1" "SEPT1" "SEPT1" "AGO2" ...
 $ Gene.Title  : chr  "argonaute RISC catalytic component 1" "septin 1" "septin 1" "argonaute RISC catalytic component 2" ...
 - attr(*, ".internal.selfref")=<externalptr> 

The gene symbols come from different origin. Thus, we don’t know how many of them will match with the methylation known genes UCSC_RefGene_Name

library(IlluminaHumanMethylationEPICanno.ilm10b2.hg19)
known.genes<-IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Other$UCSC_RefGene_Name
known.genes<-unique(unlist(sapply(known.genes,function(x)unlist(strsplit(x,";")))))
saveRDS(known.genes, "data/known.genes.RDS")

We can check how many candidate genes we have some methylation data availabe from the EPIC array:

length(known.genes)
[1] 27369

We can now check how many probes from the ClariomD panel map to those same genes:

table(anno$Gene.Symbol %in% known.genes)

 FALSE   TRUE 
108769  25979 
candidate_genes <- unique(intersect(unique(anno$Gene.Symbol),known.genes))

25979 probes out of 134748 in the array will provide useful information that we can match with methylation data. That is a bit less than 20% of the array. In contrast, 88% of the genes present in the methylation array annotation are also present here.

One option would be that ClariomD has much more genes in it’s annotation than the EPIC arrays. Let’s check on that:

library(clariomdhumantranscriptcluster.db)
Loading required package: AnnotationDbi
Loading required package: stats4
Loading required package: BiocGenerics

Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min
Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.
Loading required package: IRanges
Loading required package: S4Vectors

Attaching package: 'S4Vectors'
The following objects are masked from 'package:data.table':

    first, second
The following objects are masked from 'package:base':

    expand.grid, I, unname

Attaching package: 'IRanges'
The following object is masked from 'package:data.table':

    shift
Loading required package: org.Hs.eg.db
select(clariomdhumantranscriptcluster.db, keys=keys(clariomdhumantranscriptcluster.db),columns = c("PROBEID","SYMBOL"))-> clariom_gene_symbol
'select()' returned 1:many mapping between keys and columns
clariom_gene_symbol <- clariom_gene_symbol[!is.na(clariom_gene_symbol$SYMBOL),]
length(unique(clariom_gene_symbol$SYMBOL))
[1] 26284

In summary:

su <- data.table(
  # Probes in expression array:
  N_probes_array=NROW(anno),
  # Probes in expression array with associated gene symbols 
  N_probes_gene_exp=paste0(
    length(unique(clariom_gene_symbol$PROBEID)),
    " (",
    round(length(unique(clariom_gene_symbol$PROBEID))/NROW(anno)*100),
    "%)"
    ),
  # Probes in expression array with associated gene symbols that are also associated to methylation EPIC array probes
  N_probes_gene_exp_meth =paste0(
    length(unique(clariom_gene_symbol[clariom_gene_symbol$SYMBOL %in% known.genes,"PROBEID"])),
    " (",
    round(length(unique(clariom_gene_symbol[clariom_gene_symbol$SYMBOL %in% known.genes,"PROBEID"]))/NROW(anno)*100),
    "%)"
    ),
  # List of genes associated to EPIC methylation array
  N_genes_meth=length(known.genes),
  # List of genesassociated to ClariomD expression array
  N_genes_exp=length(unique(clariom_gene_symbol$SYMBOL)),
  # List of genes associated both to methylation and expression arrrays
  N_genes_exp_meth= paste0(
    length(unique(intersect(clariom_gene_symbol$SYMBOL,known.genes))),
    " (",
    round(length(unique(intersect(clariom_gene_symbol$SYMBOL,known.genes)))/length(unique(clariom_gene_symbol$SYMBOL))*100),
    "%)"
    )
)

su    
   N_probes_array N_probes_gene_exp N_probes_gene_exp_meth N_genes_meth
1:         134748       25570 (19%)            22140 (16%)        27369
   N_genes_exp N_genes_exp_meth
1:       26284      22695 (86%)

The clariomD expression array contains 134748 probes from which 25570 (19%) probes are associated to 26284 different known genes (UCSC symbol). The EPIC methylation array has annotation associated to 27369 genes. In order to compare Differentially methylated genes (DMG) with differentially expressed genes (DEGs) we match the two annotation datasets by overlaping the genes (UCSC symbol) that are associated both to expression and methylation. There is a total of su$N_genes_exp_meth candidate genes that overlap which correspond to su$N_probes_gene_exp_meth probesets from the ClariomD expression array.

A possible explanation is that the Clariom D array is intended to have what Affy calls ‘deep content’, unfortunately much of which isn’t assocaited with any annotated genes. This is why of the ~139K probesets on that array, only su$N_probes_gene_exp have an associated gene (SYMBOL/UCSC_RefGene_Name). There are probably many reasons for that, but the primary reason is that Affy put a bunch of speculative content on the array which remains speculative to this day.

3.2.2 MBNI re-map

The MBNI group at Michigan do a re-mapping of the probesets where they just pretend Affy never did any annotating, and they take all the probes on the array, align to the genome and then throw out all the probes that don’t align to a unique genomic position. They then take the remaining probes and count up all those that align to a known gene location, and put them in new probesets that are just one probeset per gene. They take the 8.1M probes on this array and throw out 5.7M of them, and only use the remaining 29% of the probes to do that.

James W. Mac Donald https://support.bioconductor.org/p/126059/

install.packages("http://mbni.org/customcdf/25.0.0/entrezg.download/clariomdhumanhsentrezg.db_25.0.0.tar.gz")
install.packages("http://mbni.org/customcdf/25.0.0/entrezg.download/pd.clariomdhuman.hs.entrezg_25.0.0.tar.gz")

Here we must start from the .CEL files to map the probes to the new probsets defined in pd.clariomdhuman.hs.entrezg

library(annotate)
library(pdInfoBuilder)
library(pd.clariomdhuman.hs.entrezg)
library(clariomdhumanhsentrezg.db)
MBNI_gene_symbol <- AnnotationDbi::select(clariomdhumanhsentrezg.db,keys=keys(clariomdhumanhsentrezg.db),columns=c("SYMBOL","PROBEID"))
su2 <- data.table(
  N_probes_array=NROW(anno),
  N_probes_gene_exp=paste0(
    length(unique(MBNI_gene_symbol$PROBEID)),
    " (",
    round(length(unique(MBNI_gene_symbol$PROBEID))/NROW(anno)*100),
    "%)"
    ),
  N_probes_gene_exp_meth =paste0(
    length(unique(MBNI_gene_symbol[MBNI_gene_symbol$SYMBOL %in% known.genes,"PROBEID"])),
    " (",
    round(length(unique(MBNI_gene_symbol[MBNI_gene_symbol$SYMBOL %in% known.genes,"PROBEID"]))/NROW(anno)*100),
    "%)"
    ),
  N_genes_meth=length(known.genes),
  N_genes_exp=length(unique(MBNI_gene_symbol$SYMBOL)),
  
  N_genes_exp_meth= paste0(
    length(unique(intersect(MBNI_gene_symbol$SYMBOL,known.genes))),
    " (",
    round(length(unique(intersect(MBNI_gene_symbol$SYMBOL,known.genes)))/length(unique(MBNI_gene_symbol$SYMBOL))*100),
    "%)"
    )
)

su2    

3.2.3 Ewig sarcoma

We can use the super-enhancers dataset for high levels of EWS-FLI1 protein from the paper (Tomazou2015?):

superenh_anno <- data.table::fread("https://medical-epigenomics.org/papers/tomazou2015/resultTables/Super-enhancersEWS-FLI1high.bed")

3.3 Expression:

3.3.1 Counts:

namedlist <- function(...) {
  nl <- list(...)
  argnames <- sys.call()
  n<-sapply(argnames[-1], as.character)
  names(nl)<-n


  return(nl)
  }

library(readxl)
library(data.table)
counts<-list()
#NBL:
NBL_raw <- readxl::read_xlsx("raw/20230712_OMartinez_ARMSNBL/DEG/Norm_NBL_values.xlsx", sheet = 1)
NBL_raw <- unique(NBL_raw[,!colnames(NBL_raw)%in%c("Gene.Symbol","Gene.Title")])
NBL<-merge(anno,NBL_raw,by="Probe.Set.ID")
#ARMS:
ARMS_raw <- readxl::read_xlsx("raw/20230712_OMartinez_ARMSNBL/DEG/Norm-RMS-merged.xlsx", sheet = 1)
ARMS_raw <- ARMS_raw[,!colnames(ARMS_raw)%in%c("Gene.Symbol","Gene.Title")]
ARMS<-merge(anno,ARMS_raw,all.y=T,by="Probe.Set.ID")
counts <-namedlist(NBL,ARMS)
str(counts)
List of 2
 $ NBL :Classes 'data.table' and 'data.frame':  134748 obs. of  34 variables:
  ..$ Probe.Set.ID: chr [1:134748] "TC0100006432.hg.1" "TC0100006433.hg.1" "TC0100006434.hg.1" "TC0100006435.hg.1" ...
  ..$ Gene.Symbol : chr [1:134748] "DDX11L1" "spopoybu" "MIR1302-10" "OR4G4P" ...
  ..$ Gene.Title  : chr [1:134748] "DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1" "Transcript Identified by AceView" "microRNA 1302-10" "olfactory receptor, family 4, subfamily G, member 4 pseudogene" ...
  ..$ rn          : chr [1:134748] "15568" "122807" "36412" "91829" ...
  ..$ NB5_1MP     : num [1:134748] 4.68 6.28 5.06 3.27 3.92 ...
  ..$ NB5_1TP     : num [1:134748] 4.92 5.88 5.17 2.82 4.11 ...
  ..$ NB5_3MP     : num [1:134748] 4.85 6.66 5.24 3.51 4.12 ...
  ..$ NB5_3TP     : num [1:134748] 4.83 6.21 4.99 3.37 4.27 ...
  ..$ NB5_5MP     : num [1:134748] 4.73 5.82 4.52 3.4 4.25 ...
  ..$ NB5_5TP     : num [1:134748] 5.1 6 5.15 3.4 3.62 ...
  ..$ NB5_6MP     : num [1:134748] 4.64 6.66 4.93 3.14 3.81 ...
  ..$ NB5_6TP     : num [1:134748] 4.55 6.2 4.96 2.9 3.91 ...
  ..$ NB5_9MP     : num [1:134748] 4.12 6.09 4.9 3.34 3.67 ...
  ..$ NB5_9TP     : num [1:134748] 4.89 6.14 4.89 3.41 3.39 ...
  ..$ NB16_2MP    : num [1:134748] 4.8 6 5.43 3.15 4.42 ...
  ..$ NB16_2TP    : num [1:134748] 5.02 6.19 4.99 3.42 3.87 ...
  ..$ NB16_3MP    : num [1:134748] 4.84 6.17 4.95 3.28 3.84 ...
  ..$ NB16_3TP    : num [1:134748] 4.48 5.94 4.83 3.68 3.77 ...
  ..$ NB16_6TP    : num [1:134748] 5.05 6.1 5.18 3.18 4 ...
  ..$ NB16_6MP    : num [1:134748] 4.57 6.35 5.03 3.6 3.74 ...
  ..$ NB16_7TP    : num [1:134748] 4.91 6.17 4.98 3.29 4.04 ...
  ..$ NB16_8TP    : num [1:134748] 4.96 6.17 5.09 2.97 4.04 ...
  ..$ NB16_9MP    : num [1:134748] 4.68 6.12 5.09 3.82 3.93 ...
  ..$ NB16_9TP    : num [1:134748] 4.94 6.25 4.87 3.44 3.52 ...
  ..$ NB5_1       : num [1:134748] 4.63 5.67 4.57 3.54 3.61 ...
  ..$ NB5_2       : num [1:134748] 4.69 5.78 4.94 3.31 3.73 ...
  ..$ NB5_3       : num [1:134748] 4.61 6.13 5.86 2.63 4.11 ...
  ..$ NB5_4       : num [1:134748] 5.1 5.86 5.43 3.15 3.99 ...
  ..$ NB5_5       : num [1:134748] 5.07 6.55 4.77 3.16 3.86 ...
  ..$ NB16_1      : num [1:134748] 4.35 6.31 4.85 3.21 4.45 ...
  ..$ NB16_2      : num [1:134748] 4.67 6.19 4.63 3.17 4.34 ...
  ..$ NB16_3      : num [1:134748] 4.47 5.88 4.97 3.15 3.57 ...
  ..$ NB16_4      : num [1:134748] 4.7 6.32 4.94 2.88 4.12 ...
  ..$ NB16_5      : num [1:134748] 4.3 6.19 5.12 3.18 4.05 ...
  ..- attr(*, ".internal.selfref")=<externalptr> 
  ..- attr(*, "sorted")= chr "Probe.Set.ID"
 $ ARMS:Classes 'data.table' and 'data.frame':  134748 obs. of  16 variables:
  ..$ Probe.Set.ID: chr [1:134748] "TC0100007786.hg.1" "TC0X00010643.hg.1" "TC0900011072.hg.1" "TC0200007996.hg.1" ...
  ..$ Gene.Symbol : chr [1:134748] "AGO1" "SEPT6" "ABCA1" "AC007881.4" ...
  ..$ Gene.Title  : chr [1:134748] "argonaute RISC catalytic component 1" "septin 6" "ATP binding cassette subfamily A member 1" NA ...
  ..$ rn          : chr [1:134748] "1" "10" "100" "1000" ...
  ..$ 1T          : num [1:134748] 9.46 9.45 5.91 7.3 7.11 ...
  ..$ 1M          : num [1:134748] 9.78 9.29 6.28 6.96 7.5 ...
  ..$ 2T          : num [1:134748] 9.24 9.19 5.69 7.15 7.24 ...
  ..$ 2M          : num [1:134748] 9.62 9.27 5.41 7.22 7.68 ...
  ..$ 3T          : num [1:134748] 9.07 9.06 6.06 7 7.24 ...
  ..$ 3M          : num [1:134748] 9.5 9.48 5.62 7.32 7.59 ...
  ..$ RH.1        : num [1:134748] 9.6 9.82 5.61 7.77 7.93 ...
  ..$ RH.2        : num [1:134748] 9.58 9.94 5.94 7.72 7.87 ...
  ..$ RH.3        : num [1:134748] 9.73 10.22 5.81 7.74 7.81 ...
  ..$ N1.1        : num [1:134748] 9.94 10 5.81 7.55 7.73 ...
  ..$ N1.2        : num [1:134748] 10.03 9.86 5.66 7.47 7.62 ...
  ..$ N1.3        : num [1:134748] 10.18 10.21 5.95 7.61 7.71 ...
  ..- attr(*, ".internal.selfref")=<externalptr> 
  ..- attr(*, "index")= int(0) 
  .. ..- attr(*, "__Probe.Set.ID")= int [1:134748] 40938 25345 64099 125672 125669 125664 23338 67697 6624 60703 ...

In order to have the same contrasts as in the methylation analysis we create the following table:

values <- tibble::tibble( # Use all possible combinations of input settings.

  group_var = "Sample_Group",
  Contrasts = c(paste(c("Sample_Grouptumour_ARMS-Sample_Groupmetastasis_ARMS", #T1 vs T2 -> mice
                        "Sample_Groupcell_line_ARMS-Sample_Groupprimary_ARMS", #T0 vs T3 -> start end
                        "Sample_Groupcell_line_ARMS-Sample_Grouptumour_ARMS",  #T0 vs T1 -> tumor
                        "Sample_Groupmetastasis_ARMS-Sample_Groupprimary_ARMS",  #T2 vs T3 -> metastasi
                        "Sample_Groupcell_line_ARMS-(Sample_Groupprimary_ARMS+Sample_Grouptumour_ARMS+Sample_Groupmetastasis_ARMS)/3",  #T0 vs T1&T2&T3
                        "(Sample_Groupprimary_ARMS+Sample_Groupcell_line_ARMS)/2-(Sample_Grouptumour_ARMS+Sample_Groupmetastasis_ARMS)/2"  #T0&T1 vs T2&T3
                        ),collapse = ";"
                      ),
                paste(c("Sample_Groupprimary_NBL-Sample_Grouptumour_NBL", #T0 vs T1 -> tumor
                        "Sample_Grouptumour_NBL-Sample_Groupmetastasis_NBL", #T1 vs T2 -> mice
                        "(Sample_Groupprimary_NBL+Sample_Grouptumour_NBL)/2-Sample_Groupmetastasis_NBL",  #T0&T1 vs T2 -> tumor vs metastasi
                        "Sample_Groupprimary_NBL-(Sample_Grouptumour_NBL+Sample_Groupmetastasis_NBL)/2" #T0 vs T1&T2 -> culture vs Tissue
                        ),collapse = ";"
                      ),
                paste(c("Sample_Grouptumour_ARMS-Sample_Grouptumour_NBL", # T1 -> tumors
                       "Sample_Groupmetastasis_ARMS-Sample_Groupmetastasis_NBL", #T2 -> metastasi
                       "Sample_Groupcell_line_ARMS-Sample_Groupprimary_NBL",  # T0 CL
                       "(Sample_Groupcell_line_ARMS+Sample_Groupprimary_NBL)/2-(Sample_Grouptumour_ARMS+Sample_Groupmetastasis_ARMS+Sample_Grouptumour_NBL+Sample_Groupmetastasis_NBL)/4" #T0 vs T1&T2 -> culture vs Tissue

                       ),collapse = ";"
                )
  ),
                
  Contrast_names=list(c("At1-At2:tissue","At0-At3:start/end","At0-At1:tumors","At2-At3:metastasis","At0-All:Cell_line","At0At1 - At2At3:TuMet"),
                   c("Nt0-Nt1:tumor","N1-Nt2:tissue","Nt0+Nt1-Nt2:tumor/metastasi","Nt0-All:culture/tissue"),
                   c("At1-Nt1:tumors","At2-Nt2:metastasi","At0-Nt0:Cultures","culture vs tissue")
                   )
)
library(data.table)
ARMS<-counts[["ARMS"]]
ARMS_ids<-intersect(ss$Exp_SID, colnames(ARMS))
ARMS[,rn:=as.numeric(rownames(ARMS))]
anno_ids<- c("rn",colnames(anno))
setkey(ss,Exp_SID)
ARMS_exp.mat<-as.matrix(ARMS[,.SD,.SDcols=ARMS_ids])
ARMS_ss <- ss[ARMS_ids,]
ARMS.model <- mod(object = ARMS_exp.mat,
                  group_var = values$group_var[1],
                  contrasts = values$Contrasts[1],
                  covs= NULL ,
                  metadata = ARMS_ss,
                  idcol="Exp_SID",
                  rename=values$Contrast_names[[1]])
DEGs <- DMPextr(fit=ARMS.model,p.value = 0.05, beta_normalized = ARMS[,.SD,.SDcols=ARMS_ids],mDiff = 0.000001,ann=ARMS[,.SD,.SDcols=anno_ids],writeOut = F)
DEGs$Type<-ifelse(DEGs$logFC>0,"Hyper","Hypo")
apply_filter_dmps(dmps = DEGs,p.value = seq(0.00,0.05,0.01),mDiff = seq(0,4,.5),path = "analysis/DE")

pvals

DEGs[,diffexpressed:=ifelse(abs(logFC)<2,"NONE",ifelse(logFC<0,"DOWN","UP")) ]
vp<-vulcanos(data=DEGs,x="logFC",y="adj.P.Val",col="diffexpressed",label="Gene.Symbol",labSize = 2,max.overlaps = 100)
ggsave(filename = "DE_vulcano_plots.png",plot = vp,device = "png",path =  "analysis/DE",dpi = "retina" )

vulcanos

3.4 Methylation:

A previous methylation analysis was performed. You can check the results here:

3.4.1 DMRS:

dmrs<-as.data.table(readRDS("data/dmrs1_ARMS.rds"))
dmrs<-dmrs[!is.na(overlapping.genes),]
ano_cols<-c("seqnames","start","end","width","no.cpgs","HMFDR","meandiff","Contrast")
dt_list<- lapply(
  1:NROW(dmrs),function(rn){  # For every position or ProbeID do:
    UCSC <-       # 1. generate a data.table (per row)
      dmrs[rn,       # 2. split UCSC gene and position
               lapply(       # 3. new row for every combination of gene/position
                 overlapping.genes,        # 4. add Contrast porbeID and bval difference
                 function(x)unlist(strsplit(x,","))
                 )
               ,by=ano_cols]
      
    })
dt_dmrs <- rbindlist(dt_list)
DMRs_ARMS <- dt_dmrs[,Gene:=V1][!is.na(V1),]
DMRs_ARMS$V1 <- NULL

saveRDS(DMRs_ARMS,"data/DMRs_ARMS.rds")
DMRs<-DMRs_ARMS
dtable(DMRs)
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

4 Correlation:

4.1 DMRs vs expression

In first place let’s see if there is any overlap between Differentially methylated genes (DMGs) and Differentially Expressed Genes (DEGs):

DEGs[,Gene:=Gene.Symbol]
dt_DEG_DMR <- merge(DMRs_ARMS,DEGs,by=c("Gene","Contrast"),all = T)
tab <- dt_DEG_DMR[,.(DEGs=sum(!is.na(Probe.Set.ID)),DMRs=sum(!is.na(start)),overlap=sum(!(is.na(start)) &!(is.na(Probe.Set.ID)) )),by=c("Contrast")]
kableExtra::kbl(tab)|>kableExtra::kable_classic_2()
Contrast DEGs DMRs overlap
At0-All:Cell_line 3201 6650 160
At0-At1:tumors 4800 5781 214
At0-At3:start/end 147 7494 8
At0At1 - At2At3:TuMet 11213 1517 123
At2-At3:metastasis 3043 1046 25
At1-At2:tissue 0 1 0

In the table above there is one row for each DMG associated to a DMR, so there may be more ### Venn diagrams:

library(gridExtra)
library(venn)
library(ggplot2)
library(ggpolypath)
library(grid)
dt_venn <- dt_DEG_DMR[!is.na(Gene.Symbol)&!is.na(start),Gene,by=Contrast]
genelist <- unique(dt_venn$Gene)
gl <- as.data.frame(lapply(unique(dt_venn$Contrast),function(x)genelist %in% dt_venn[Contrast==x,Gene]))
names(gl)<-unique(dt_venn$Contrast)
ARMS_venn.plot <- venn::venn(gl,ilabels=TRUE, zcolor = "style", ggplot=T, box=F)
# ARMS_euler.plot <- plot(euler(gl, shape = 'ellipse'),quantities=T)
# ARMS_euler.plot
ARMS_venn <- grid.arrange(ARMS_venn.plot,
             top=grid::textGrob("ARMS: Differentially Expressed & Differentially Methylated Genes",gp=grid::gpar(fontsize=14,font=3))
)

ggsave(filename = "ARMS_VENN.png",plot = ARMS_venn,device = "png",path =  "analysis/DE",dpi = "retina" )

VENN_ARMS

4.1.1 DEG/DMR overlapping Correlations:

Now we want to get the mean beta value for each sample in the dmr location: 1. Find all cpg that fall in the dmr 2. get the mean for each sample 3. Match each sample bVal with it’s expression value. (No way to do it, since there is no key sample to sample) 4. Make correlation 5. Plot.

ol <- dt_DEG_DMR[!is.na(Gene.Symbol)&!is.na(start),]
library(data.table)
library(GenomicRanges)
library(IlluminaHumanMethylationEPICanno.ilm10b2.hg19)
betas <-as.data.table(readRDS("data/betas_ARMS.rds"),keep.rownames = "ProbeID")
locs <- IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Locations
locs.ranges <- GRanges(locs$chr, IRanges(locs$pos, locs$pos))
names(locs.ranges) <- rownames(locs)
# locs.ranges$gene_element <- IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Other$UCSC_RefGene_Group
# locs.ranges$gene <- IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Other$UCSC_RefGene_Name
mcols(locs.ranges)<-IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Other
dt_DEG_DMR_intersect<-lapply(1:NROW(ol),function(nr){
  cont<-ol[nr,Contrast]
  xpr <- counts[["ARMS"]][Probe.Set.ID==ol[nr,Probe.Set.ID]]
  xpr<-xpr[,.SD,.SDcols=names(xpr)[!(names(xpr)%in%c("rn",names(anno)))]]
  expression <- suppressWarnings(melt(xpr,measure.vars=colnames(xpr),variable.name = "SID",value.name = "TPM" ))
  SIDs <- ss[Exp_SID %in% as.character(expression$SID),barcode]
  cgs <- names(subsetByOverlaps(locs.ranges,GRanges(ol[nr,])))

  #### Annotate DMRs with genomic location:
  
  ann<-as.data.frame(mcols( subsetByOverlaps(locs.ranges,GRanges(ol[nr,]))))|>as.data.table(keep.rownames="ProbeID")

  # Possibility of multiple genes and genomic location per probe
  genomic<-lapply(1:NROW(ann),
                  function(cg){
                    # For every position or ProbeID do:
                                          # 1. generate a data.table (per row)
                    UCSC_genes <-ann[cg,        # 2. split UCSC gene and position
                        lapply(           # 3. new row for every combination of gene/position
                               .SD,       # 4. add Contrast porbeID and bval difference
                               function(x)unlist(strsplit(x,";"))
                               )
                        ,.SDcols=c("UCSC_RefGene_Group","UCSC_RefGene_Name")]
                    enhancers<-ann[cg,.SD,.SDcols=unlist(sapply(c("Enhancer","OpenChromatin","TFBS"),function(n) colnames(ann)[colnames(ann) %like% n]))]
                    return(cbind(UCSC_genes,enhancers))
                  }
                  
                  )
  # 5. rbind all together:
  genomic_loc <- rbindlist(genomic)|> unique() # should get gene annotation from here to be fair
  ge <- factor(genomic_loc$UCSC_RefGene_Group,levels = c("3'UTR","ExonBnd", "1stExon","5'UTR","Body","TSS200","TSS1500"),ordered=T  )
  gene_location <- ifelse(startsWith( as.character(max(ge)),"TSS" ),"promoter","intragenic")
  
  annot <- genomic_loc[,lapply(.SD,function(x)paste(unique(x[nchar(x)>0]),collapse = ";"))]
  annot$gene_location <- gene_location
  ####
  
  b<-betas[ProbeID %in% cgs ,lapply(.SD,function(x)mean(x,na.rm=T)),.SDcols=SIDs]
  meth <- suppressWarnings( melt(b,measure.vars=colnames(b),variable.name = "barcode",value.name = "DMR.meth" ))
  meth$SID <- ss[meth,Exp_SID,on="barcode"]
  # list(meth,expression)
  m <- merge(meth,expression,by="SID")
  m <- m[,lapply(.SD,as.numeric),by=SID]
  dt<-cbind(ol[nr,],m)
  cor<-cor.test(dt$DMR.meth,dt$TPM)
  dt$correlation <- cor$estimate
  dt$cor.pval <- cor$p.value
  dt <- cbind(dt,annot)
  dt$barcode=nr
  return(dt)
  })|>rbindlist()
dt_cor <-dt_DEG_DMR_intersect
dt_cor$Condition <-ss[dt_DEG_DMR_intersect,"Condition",on="SID"]
dt_cor$Contrast <- as.factor(dt_cor$Contrast)
dt_cor$SID <- as.factor(dt_cor$SID)
reg_element_names<-unlist(sapply(c("Enhancer","OpenChromatin","TFBS"),function(n) colnames(dt_cor)[colnames(dt_cor) %like% n]))
reg <- dt_cor[,sapply(.SD,nchar)>0,.SDcols=reg_element_names   ]
dt_cor$regulatory <- apply(reg,1,function(x)paste(colnames(reg)[x],collapse=";"))
setorder(dt_cor,cor.pval,Gene)
saveRDS(dt_cor,"data/dt_cor.rds")
reg_element_names<-unlist(sapply(c("Enhancer","OpenChromatin","TFBS"),function(n) colnames(dt_cor)[colnames(dt_cor) %like% n]))

4.1.1.1 Table for DMR & DEG:

In the following table the values for methylation and expression are integrated:

dtable(dt_cor[,.SD,.SDcols=c("Gene","Contrast","SID","DMR.meth","TPM","correlation","cor.pval")])

4.1.2 Plots

library(ggplot2)
library(data.table)
library(ggpubr)
library(ggrepel)
DMR_DEG_plots <- list()
plots_folder <- "analysis/Integration_with_expression/DEGvsDMR/pval"
dir.create(plots_folder,recursive = T,showWarnings = F)
setkeyv(dt_cor,c("barcode"))

for(cont in unique(dt_cor$Contrast)){
  
  ID<-dt_cor[Contrast==cont,head(unique(barcode),10)]
  plist <- lapply(ID,function(nr){
    pdata <- dt_cor[barcode==nr,]
    plot_name <- paste0("/DEG_DMR.pval_", unlist(unique(dt_cor[barcode==nr,Gene]))[1], ".png")
    plt_path <- paste(plots_folder, cont, sep = .Platform$file.sep)
    dir.create(plt_path,recursive = T,showWarnings = F)
    p<-ggplot(data=pdata, aes(x=as.numeric(TPM),y=as.numeric(DMR.meth),colour=SID)) +
    geom_point(aes(shape=Condition),size=8,alpha=0.6) +
    xlab("Mean Gene methylation (DMR)") + ylab("Mean Gene Expression (Probeset)") +

    geom_smooth(method = "lm",
                inherit.aes = F,
                aes(x=as.numeric(pdata$TPM),y=as.numeric(pdata$DMR.meth)),
                linetype="dashed",
                se=F,
                formula = "y ~ x")+

    annotate(geom = "text",
             x = -Inf, y = Inf,
             label=paste0("correlation: ", round(unique(pdata$correlation),4), ", p-value: ", round(unique(pdata$cor.pval),8)),
             hjust = 0, vjust = 1
             )  +

    ggtitle(paste0(" Correlation between expression and methylation for gene: ",unique(pdata$Gene)))
    ggsave(path = plt_path, filename = plot_name, plot=p)
    return(p)
  })
  DMR_DEG_plots[[cont]] <- plist
  }
saveRDS(DMR_DEG_plots,"data/DMR_DEG_plots.rds")
# FUN <- function(cont) cat("\n\n#### Top Correlation plots for contrast ", cont, "\n
# ```{r}
# #| column: screen-inset-right
# #| layout-ncol: 4
# #| results: hide
# #| fig-height: 4
# #| fig-keep: all
#   print(DMR_DEG_plots[[",cont,"]][1:10])
# ```\n")
# 

# invisible(sapply(values$Contrast_names[[1]],FUN))
DMR_DEG_plots

5 Pathway analysis

The pathways analysis will be performed analysing 2 sets of genes: 1. Full DMR/DMPs: the full set of significant diferentially methylated & expressed genes. 2. Cor.sig: A subset of the above where the correlation p.value is smaller than 0.05

5.1 DNA methylation context:

Although the relationship between DNA methylation and gene expression is complex and the mechanisms involved in gene regulation by DNA methylation are diverse. Fortunately, over the years researchers have found some common relationships between gene methylation and gene expression:

DNA methylation is an epigenetic mark that has been traditionally associated with gene silencing, specially when methylation happens on promoter regions. DNA methylation is related with the repressed chromatin state which blocks the access of transcription factors to promoter regions. Thus, silencing promoter activity in a stable way and reducing transcription Suzuki and Bird (2008).

In the other hand, the bodies of active genes in plants and animals are often heavily methylated. Suzuki and Bird (2008)

Therefore, usually it is expected to find most of the genes to have negative correlations between DNA methylation and gene transcription in promoter regions while some of the genes might present positive correlations specially in intra-genic regions.

Now, let’s inspect our data, the following table shows a summary of the paired DMGs and DEGs:

tab.full <- dt_cor[,.(
  negative.cor=sum(unique(correlation)<0),
  positive.cor=sum(unique(correlation)>0),
  Regulatory.elements=sum(nchar(unique(regulatory))>0),
  Total=sum(unique(correlation)!=0)),
  by=c("Contrast","barcode")]
tab_summary <-tab.full[,lapply(.SD,sum),by=Contrast,.SDcols=c("Regulatory.elements","negative.cor","positive.cor","Total")]
kableExtra::kbl(tab_summary)|>kableExtra::kable_classic_2()
Contrast Regulatory.elements negative.cor positive.cor Total
At0-All:Cell_line 205 100 157 257
At0-At1:tumors 326 156 243 399
At0-At3:start/end 24 20 12 32
At0At1 - At2At3:TuMet 130 57 115 172
At2-At3:metastasis 64 52 41 93

Here you can see a summary table of the Total ammount of genes that are both Differentially Methylated and Differentially Expressed. For this to yield true, there has to be at least one DMR, composed by at least 3 probes (cpg) from the methylation microarray that fall in the same region of the genome as well as a significant difference in expression associated to the same gene: - Regulatory.elements: Genes that have a DMR associated to any of the following regulatory elements: r paste(reg_element_names, collapse=", ") - negative.cor: Total number of negatively correlated genes where methylation and expression work in opposite directions. - positive.cor: Total number of positively correlated genes where methylation and expression go in the same direction. - Total ammount of genes that are both DE and DM.

dtable(dt_cor[,.SD,.SDcols=c("Gene","Contrast","no.cpgs","width","SID","DMR.meth","TPM","correlation","cor.pval", "gene_location",reg_element_names)])
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

5.2 Full

This dataset is not filtered by the strength of the correlation between DMRs and DEGs.

5.2.1 Pathways

path_results<-function(pathway,topN=50,method="method",pval=0.05,path="results/pathways.csv",cols=NULL){
  # pathway<-pathway[FDR<1,]
  pathway$method<-pathway[[method]]
  data.table::setorder(pathway,method,FDR)
  sig_idx <- pathway[,.I[FDR < pval]  ,by=method]$V1
  head_idx<-pathway[,.I[1:min(..topN,.N)],by=c(method,"Contrast")]$V1
  res<-pathway[base::union(sig_idx,head_idx),]
  # res[,TERM:=ifelse(FDR<pval,paste0("*** ",TERM," ***"),TERM)]
  # data.table::fwrite(res,path)
  results<-res[,.SD,.SDcols=c("Contrast","FDR",cols,"TERM","method")]
  data.table::setorder(results,Contrast,method,FDR)
  return(results)
}

library(gprofiler2)
pathways_full <- lapply(unique(dt_cor$Contrast),function(cont){
  library(gprofiler2)
  library(data.table)
   p<-gprofiler2::gost(signif = T ,unique(dt_cor[Contrast==cont,Gene]))[[1]]
   if(!is.null(p)){
    dth<- data.table::as.data.table(p)
    dth[,FDR:=p_value]
    dth[,TERM:=term_name]
    dth[,source:=factor(source)]
    dth[,Contrast:=cont]
   }
  })
Pathway_Full<-path_results(rbindlist(pathways_full),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full,"data/pathway_full.rds")
dtable(Pathway_Full)

As you can see the terms are rather general. Let’s try to filter based on the direction of correlation + / -:

5.2.1.1 Positive:

dt_pos <- dt_cor[correlation>0,]
pathways_full_positive <- lapply(unique(dt_pos$Contrast),function(cont){
  library(gprofiler2)
  library(data.table)
   p<-gprofiler2::gost(signif = T ,unique(dt_pos[Contrast==cont,Gene]))[[1]]
   if(!is.null(p)){
    dth<- data.table::as.data.table(p)
    dth[,FDR:=p_value]
    dth[,TERM:=term_name]
    dth[,source:=factor(source)]
    dth[,Contrast:=cont]
   }
  })
No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_Pos<-path_results(rbindlist(pathways_full_positive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_Pos,"data/pathway_full_positive.rds")
dtable(Pathway_Full_Pos)

5.2.1.2 Negative:

dt_neg <- dt_cor[correlation<0,]
pathways_full_negitive <- lapply(unique(dt_neg$Contrast),function(cont){
  library(gprofiler2)
  library(data.table)
   p<-gprofiler2::gost(signif = T ,unique(dt_neg[Contrast==cont,Gene]))[[1]]
   if(!is.null(p)){
    dth<- data.table::as.data.table(p)
    dth[,FDR:=p_value]
    dth[,TERM:=term_name]
    dth[,source:=factor(source)]
    dth[,Contrast:=cont]
   }
  })
No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_neg<-path_results(rbindlist(pathways_full_negitive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_neg,"data/pathway_full_negative.rds")
dtable(Pathway_Full_neg)

5.3 cor.sig

This dataset is not filtered by the strength of the correlation between DMRs and DEGs.

5.3.1 Pathways

dt.sig <-  dt_cor[cor.pval<0.05 ,.(correlation=unique(correlation)),by=c("Contrast","Gene","width")]


library(gprofiler2)
pathways_full <- lapply(unique(dt.sig$Contrast),function(cont){
  library(gprofiler2)
  library(data.table)
   p<-gprofiler2::gost(signif = T ,unique(dt.sig[Contrast==cont,Gene]))[[1]]
   if(!is.null(p)){
    dth<- data.table::as.data.table(p)
    dth[,FDR:=p_value]
    dth[,TERM:=term_name]
    dth[,source:=factor(source)]
    dth[,Contrast:=cont]
   }
  })
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full<-path_results(rbindlist(pathways_full),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full,"data/pathway_full_sig.rds")
dtable(Pathway_Full)

As you can see the terms are rather general. Let’s try to filter based on the direction of correlation + / -:

5.3.1.1 Positive:

dt_pos <- dt.sig[correlation>0,]
pathways_full_positive <- lapply(unique(dt_pos$Contrast),function(cont){
  library(gprofiler2)
  library(data.table)
   p<-gprofiler2::gost(signif = T ,unique(dt_pos[Contrast==cont,Gene]))[[1]]
   if(!is.null(p)){
    dth<- data.table::as.data.table(p)
    dth[,FDR:=p_value]
    dth[,TERM:=term_name]
    dth[,source:=factor(source)]
    dth[,Contrast:=cont]
   }
  })
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_Pos<-path_results(rbindlist(pathways_full_positive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_Pos,"data/pathway_full_positive_sig.rds")
dtable(Pathway_Full_Pos)

5.3.1.2 Negative:

dt_neg <- dt.sig[correlation<0,]
pathways_full_negitive <- lapply(unique(dt_neg$Contrast),function(cont){
  library(gprofiler2)
  library(data.table)
   p<-gprofiler2::gost(signif = T ,unique(dt_neg[Contrast==cont,Gene]))[[1]]
   if(!is.null(p)){
    dth<- data.table::as.data.table(p)
    dth[,FDR:=p_value]
    dth[,TERM:=term_name]
    dth[,source:=factor(source)]
    dth[,Contrast:=cont]
   }
  })
Pathway_Full_neg<-path_results(rbindlist(pathways_full_negitive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_neg,"data/pathway_full_negative_sig.rds")
dtable(Pathway_Full_neg)

–>

–>

–> –> –> –> –>

–>

–>

–>

–>

–> –> –> –> –>

–> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –>

–>

–>

–> –> –> –> –> –> –> –> –> –> –> –> –> –> –>

–>

–> –> –> –>

–>

–> –> –>